1. Data Processing

# Load the Data
knitr::opts_knit$set(root.dir = '/Users/Faiz Muthi/Downloads/data')
customers_dataset <- read.csv('/Users/Faiz Muthi/Downloads/data/customers_dataset.csv', sep=",")

products_dataset <- read.csv('/Users/Faiz Muthi/Downloads/data/products_dataset.csv', sep=",")

geolocation_dataset <- read.csv('/Users/Faiz Muthi/Downloads/data/geolocation_dataset.csv', sep=",")

order_items_dataset <- read.csv('/Users/Faiz Muthi/Downloads/data/order_items_dataset.csv', sep=",")

order_reviews_dataset <- read.csv('/Users/Faiz Muthi/Downloads/data/order_reviews_dataset.csv', sep=",")

sellers_dataset <- read.csv('/Users/Faiz Muthi/Downloads/data/sellers_dataset.csv', sep=",")

order_payments_dataset <- read.csv('/Users/Faiz Muthi/Downloads/data/order_payments_dataset.csv', sep=",")

orders_dataset <- read.csv('/Users/Faiz Muthi/Downloads/data/orders_dataset.csv', sep=",")

product_category_name_translation <- read.csv('/Users/Faiz Muthi/Downloads/data/product_category_name_translation.csv', sep=",")
# Check the structure of datasets
str(customers_dataset)
## 'data.frame':    99441 obs. of  5 variables:
##  $ customer_id             : chr  "06b8999e2fba1a1fbc88172c00ba8bc7" "18955e83d337fd6b2def6b18a428ac77" "4e7b3e00288586ebd08712fdd0374a03" "b2b6027bc5c5109e529d4dc6358b12c3" ...
##  $ customer_unique_id      : chr  "861eff4711a542e4b93843c6dd7febb0" "290c77bc529b7ac935b93aa66c333dc3" "060e732b5b29e8181a18229c7b0b2b5e" "259dac757896d24d7702b9acbbff3f3c" ...
##  $ customer_zip_code_prefix: int  14409 9790 1151 8775 13056 89254 4534 35182 81560 30575 ...
##  $ customer_city           : chr  "franca" "sao bernardo do campo" "sao paulo" "mogi das cruzes" ...
##  $ customer_state          : chr  "SP" "SP" "SP" "SP" ...
str(products_dataset)
## 'data.frame':    32951 obs. of  9 variables:
##  $ product_id                : chr  "1e9e8ef04dbcff4541ed26657ea517e5" "3aa071139cb16b67ca9e5dea641aaa2f" "96bd76ec8810374ed1b65e291975717f" "cef67bcfe19066a932b7673e239eb23d" ...
##  $ product_category_name     : chr  "perfumaria" "artes" "esporte_lazer" "bebes" ...
##  $ product_name_lenght       : int  40 44 46 27 37 60 56 56 57 36 ...
##  $ product_description_lenght: int  287 276 250 261 402 745 1272 184 163 1156 ...
##  $ product_photos_qty        : int  1 1 1 1 4 1 4 2 1 1 ...
##  $ product_weight_g          : int  225 1000 154 371 625 200 18350 900 400 600 ...
##  $ product_length_cm         : int  16 30 18 26 20 38 70 40 27 17 ...
##  $ product_height_cm         : int  10 18 9 4 17 5 24 8 13 10 ...
##  $ product_width_cm          : int  14 20 15 26 13 11 44 40 17 12 ...
str(geolocation_dataset)
## 'data.frame':    1000163 obs. of  5 variables:
##  $ geolocation_zip_code_prefix: int  1037 1046 1046 1041 1035 1012 1047 1013 1029 1011 ...
##  $ geolocation_lat            : num  -23.5 -23.5 -23.5 -23.5 -23.5 ...
##  $ geolocation_lng            : num  -46.6 -46.6 -46.6 -46.6 -46.6 ...
##  $ geolocation_city           : chr  "sao paulo" "sao paulo" "sao paulo" "sao paulo" ...
##  $ geolocation_state          : chr  "SP" "SP" "SP" "SP" ...
str(order_items_dataset)
## 'data.frame':    112650 obs. of  7 variables:
##  $ order_id           : chr  "00010242fe8c5a6d1ba2dd792cb16214" "00018f77f2f0320c557190d7a144bdd3" "000229ec398224ef6ca0657da4fc703e" "00024acbcdf0a6daa1e931b038114c75" ...
##  $ order_item_id      : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ product_id         : chr  "4244733e06e7ecb4970a6e2683c13e61" "e5f2d52b802189ee658865ca93d83a8f" "c777355d18b72b67abbeef9df44fd0fd" "7634da152a4610f1595efa32f14722fc" ...
##  $ seller_id          : chr  "48436dade18ac8b2bce089ec2a041202" "dd7ddc04e1b6c2c614352b383efe2d36" "5b51032eddd242adc84c38acab88f23d" "9d7a1d34a5052409006425275ba1c2b4" ...
##  $ shipping_limit_date: chr  "2017-09-19 09:45:35" "2017-05-03 11:05:13" "2018-01-18 14:48:30" "2018-08-15 10:10:18" ...
##  $ price              : num  58.9 239.9 199 13 199.9 ...
##  $ freight_value      : num  13.3 19.9 17.9 12.8 18.1 ...
str(order_reviews_dataset)
## 'data.frame':    99224 obs. of  7 variables:
##  $ review_id              : chr  "7bc2406110b926393aa56f80a40eba40" "80e641a11e56f04c1ad469d5645fdfde" "228ce5500dc1d8e020d8d1322874b6f0" "e64fb393e7b32834bb789ff8bb30750e" ...
##  $ order_id               : chr  "73fc7af87114b39712e6da79b0a377eb" "a548910a1c6147796b98fdf73dbeba33" "f9e4b658b201a9f2ecdecbb34bed034b" "658677c97b385a9be170737859d3511b" ...
##  $ review_score           : int  4 5 5 5 5 1 5 5 5 4 ...
##  $ review_comment_title   : chr  "" "" "" "" ...
##  $ review_comment_message : chr  "" "" "" "Recebi bem antes do prazo estipulado." ...
##  $ review_creation_date   : chr  "2018-01-18 00:00:00" "2018-03-10 00:00:00" "2018-02-17 00:00:00" "2017-04-21 00:00:00" ...
##  $ review_answer_timestamp: chr  "2018-01-18 21:46:59" "2018-03-11 03:05:13" "2018-02-18 14:36:24" "2017-04-21 22:02:06" ...
str(sellers_dataset)
## 'data.frame':    3095 obs. of  4 variables:
##  $ seller_id             : chr  "3442f8959a84dea7ee197c632cb2df15" "d1b65fc7debc3361ea86b5f14c68d2e2" "ce3ad9de960102d0677a81f5d0bb7b2d" "c0f3eea2e14555b6faeea3dd58c1b1c3" ...
##  $ seller_zip_code_prefix: int  13023 13844 20031 4195 12914 20920 55325 16304 1529 80310 ...
##  $ seller_city           : chr  "campinas" "mogi guacu" "rio de janeiro" "sao paulo" ...
##  $ seller_state          : chr  "SP" "SP" "RJ" "SP" ...
str(order_payments_dataset)
## 'data.frame':    103886 obs. of  5 variables:
##  $ order_id            : chr  "b81ef226f3fe1789b1e8b2acac839d17" "a9810da82917af2d9aefd1278f1dcfa0" "25e8ea4e93396b6fa0d3dd708e76c1bd" "ba78997921bbcdc1373bb41e913ab953" ...
##  $ payment_sequential  : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ payment_type        : chr  "credit_card" "credit_card" "credit_card" "credit_card" ...
##  $ payment_installments: int  8 1 1 8 2 2 1 3 6 1 ...
##  $ payment_value       : num  99.3 24.4 65.7 107.8 128.4 ...
str(orders_dataset)
## 'data.frame':    99441 obs. of  8 variables:
##  $ order_id                     : chr  "e481f51cbdc54678b7cc49136f2d6af7" "53cdb2fc8bc7dce0b6741e2150273451" "47770eb9100c2d0c44946d9cf07ec65d" "949d5b44dbf5de918fe9c16f97b45f8a" ...
##  $ customer_id                  : chr  "9ef432eb6251297304e76186b10a928d" "b0830fb4747a6c6d20dea0b8c802d7ef" "41ce2a54c0b03bf3443c3d931a367089" "f88197465ea7920adcdbec7375364d82" ...
##  $ order_status                 : chr  "delivered" "delivered" "delivered" "delivered" ...
##  $ order_purchase_timestamp     : chr  "2017-10-02 10:56:33" "2018-07-24 20:41:37" "2018-08-08 08:38:49" "2017-11-18 19:28:06" ...
##  $ order_approved_at            : chr  "2017-10-02 11:07:15" "2018-07-26 03:24:27" "2018-08-08 08:55:23" "2017-11-18 19:45:59" ...
##  $ order_delivered_carrier_date : chr  "2017-10-04 19:55:00" "2018-07-26 14:31:00" "2018-08-08 13:50:00" "2017-11-22 13:39:59" ...
##  $ order_delivered_customer_date: chr  "2017-10-10 21:25:13" "2018-08-07 15:27:45" "2018-08-17 18:06:29" "2017-12-02 00:28:42" ...
##  $ order_estimated_delivery_date: chr  "2017-10-18 00:00:00" "2018-08-13 00:00:00" "2018-09-04 00:00:00" "2017-12-15 00:00:00" ...
str(product_category_name_translation)
## 'data.frame':    71 obs. of  2 variables:
##  $ product_category_name        : chr  "beleza_saude" "informatica_acessorios" "automotivo" "cama_mesa_banho" ...
##  $ product_category_name_english: chr  "health_beauty" "computers_accessories" "auto" "bed_bath_table" ...
# Summary statistics
summary(customers_dataset)
##  customer_id        customer_unique_id customer_zip_code_prefix
##  Length:99441       Length:99441       Min.   : 1003           
##  Class :character   Class :character   1st Qu.:11347           
##  Mode  :character   Mode  :character   Median :24416           
##                                        Mean   :35137           
##                                        3rd Qu.:58900           
##                                        Max.   :99990           
##  customer_city      customer_state    
##  Length:99441       Length:99441      
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
## 
summary(products_dataset)
##   product_id        product_category_name product_name_lenght
##  Length:32951       Length:32951          Min.   : 5.00      
##  Class :character   Class :character      1st Qu.:42.00      
##  Mode  :character   Mode  :character      Median :51.00      
##                                           Mean   :48.48      
##                                           3rd Qu.:57.00      
##                                           Max.   :76.00      
##                                           NA's   :610        
##  product_description_lenght product_photos_qty product_weight_g
##  Min.   :   4.0             Min.   : 1.000     Min.   :    0   
##  1st Qu.: 339.0             1st Qu.: 1.000     1st Qu.:  300   
##  Median : 595.0             Median : 1.000     Median :  700   
##  Mean   : 771.5             Mean   : 2.189     Mean   : 2276   
##  3rd Qu.: 972.0             3rd Qu.: 3.000     3rd Qu.: 1900   
##  Max.   :3992.0             Max.   :20.000     Max.   :40425   
##  NA's   :610                NA's   :610        NA's   :2       
##  product_length_cm product_height_cm product_width_cm
##  Min.   :  7.00    Min.   :  2.00    Min.   :  6.0   
##  1st Qu.: 18.00    1st Qu.:  8.00    1st Qu.: 15.0   
##  Median : 25.00    Median : 13.00    Median : 20.0   
##  Mean   : 30.82    Mean   : 16.94    Mean   : 23.2   
##  3rd Qu.: 38.00    3rd Qu.: 21.00    3rd Qu.: 30.0   
##  Max.   :105.00    Max.   :105.00    Max.   :118.0   
##  NA's   :2         NA's   :2         NA's   :2
summary(geolocation_dataset)
##  geolocation_zip_code_prefix geolocation_lat  geolocation_lng  
##  Min.   : 1001               Min.   :-36.61   Min.   :-101.47  
##  1st Qu.:11075               1st Qu.:-23.60   1st Qu.: -48.57  
##  Median :26530               Median :-22.92   Median : -46.64  
##  Mean   :36574               Mean   :-21.18   Mean   : -46.39  
##  3rd Qu.:63504               3rd Qu.:-19.98   3rd Qu.: -43.77  
##  Max.   :99990               Max.   : 45.07   Max.   : 121.11  
##  geolocation_city   geolocation_state 
##  Length:1000163     Length:1000163    
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
## 
summary(order_items_dataset)
##    order_id         order_item_id     product_id         seller_id        
##  Length:112650      Min.   : 1.000   Length:112650      Length:112650     
##  Class :character   1st Qu.: 1.000   Class :character   Class :character  
##  Mode  :character   Median : 1.000   Mode  :character   Mode  :character  
##                     Mean   : 1.198                                        
##                     3rd Qu.: 1.000                                        
##                     Max.   :21.000                                        
##  shipping_limit_date     price         freight_value   
##  Length:112650       Min.   :   0.85   Min.   :  0.00  
##  Class :character    1st Qu.:  39.90   1st Qu.: 13.08  
##  Mode  :character    Median :  74.99   Median : 16.26  
##                      Mean   : 120.65   Mean   : 19.99  
##                      3rd Qu.: 134.90   3rd Qu.: 21.15  
##                      Max.   :6735.00   Max.   :409.68
summary(order_reviews_dataset)
##   review_id           order_id          review_score   review_comment_title
##  Length:99224       Length:99224       Min.   :1.000   Length:99224        
##  Class :character   Class :character   1st Qu.:4.000   Class :character    
##  Mode  :character   Mode  :character   Median :5.000   Mode  :character    
##                                        Mean   :4.086                       
##                                        3rd Qu.:5.000                       
##                                        Max.   :5.000                       
##  review_comment_message review_creation_date review_answer_timestamp
##  Length:99224           Length:99224         Length:99224           
##  Class :character       Class :character     Class :character       
##  Mode  :character       Mode  :character     Mode  :character       
##                                                                     
##                                                                     
## 
summary(sellers_dataset)
##   seller_id         seller_zip_code_prefix seller_city       
##  Length:3095        Min.   : 1001          Length:3095       
##  Class :character   1st Qu.: 7094          Class :character  
##  Mode  :character   Median :14940          Mode  :character  
##                     Mean   :32291                            
##                     3rd Qu.:64553                            
##                     Max.   :99730                            
##  seller_state      
##  Length:3095       
##  Class :character  
##  Mode  :character  
##                    
##                    
## 
summary(order_payments_dataset)
##    order_id         payment_sequential payment_type       payment_installments
##  Length:103886      Min.   : 1.000     Length:103886      Min.   : 0.000      
##  Class :character   1st Qu.: 1.000     Class :character   1st Qu.: 1.000      
##  Mode  :character   Median : 1.000     Mode  :character   Median : 1.000      
##                     Mean   : 1.093                        Mean   : 2.853      
##                     3rd Qu.: 1.000                        3rd Qu.: 4.000      
##                     Max.   :29.000                        Max.   :24.000      
##  payment_value     
##  Min.   :    0.00  
##  1st Qu.:   56.79  
##  Median :  100.00  
##  Mean   :  154.10  
##  3rd Qu.:  171.84  
##  Max.   :13664.08
summary(orders_dataset)
##    order_id         customer_id        order_status      
##  Length:99441       Length:99441       Length:99441      
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##  order_purchase_timestamp order_approved_at  order_delivered_carrier_date
##  Length:99441             Length:99441       Length:99441                
##  Class :character         Class :character   Class :character            
##  Mode  :character         Mode  :character   Mode  :character            
##  order_delivered_customer_date order_estimated_delivery_date
##  Length:99441                  Length:99441                 
##  Class :character              Class :character             
##  Mode  :character              Mode  :character
# Handle Missing Values
# Drop rows with missing values
customers_dataset <- na.omit(customers_dataset)
products_dataset <- na.omit(products_dataset)
geolocation_dataset <- na.omit(geolocation_dataset)
order_items_dataset <- na.omit(order_items_dataset)
order_reviews_dataset <- na.omit(order_reviews_dataset)
sellers_dataset <- na.omit(sellers_dataset)
order_payments_dataset <- na.omit(order_payments_dataset)
orders_dataset <- na.omit(orders_dataset)

# Convert date columns to Date type
orders_dataset$order_purchase_timestamp <- as.Date(orders_dataset$order_purchase_timestamp)
orders_dataset$order_delivered_customer_date <- as.Date(orders_dataset$order_delivered_customer_date)
orders_dataset$order_approved_at <- as.Date(orders_dataset$order_approved_at)
orders_dataset$order_delivered_carrier_date <- as.Date(orders_dataset$order_delivered_carrier_date)
orders_dataset$order_estimated_delivery_date <- as.Date(orders_dataset$order_estimated_delivery_date)

2. Basic Analytics

library(ggplot2)
# Customers Dataset
# Distribution of customer zip codes
ggplot(customers_dataset, aes(x = customer_zip_code_prefix)) +
  geom_histogram(bins = 30, fill = "blue", color = "black") +
  labs(title = "Distribution of Customer Zip Codes", x = "Zip Code Prefix", y = "Frequency")

# Count of customers by state
ggplot(customers_dataset, aes(x = customer_state)) +
  geom_bar(fill = "green", color = "black") +
  labs(title = "Number of Customers by State", x = "State", y = "Count")

# Products Dataset
# Distribution of product weights
ggplot(products_dataset, aes(x = product_weight_g)) +
  geom_histogram(bins = 30, fill = "orange", color = "black") +
  labs(title = "Distribution of Product Weights", x = "Weight (g)", y = "Frequency")

# Count of products by category
ggplot(products_dataset, aes(x = product_category_name)) +
  geom_bar(fill = "purple", color = "black") +
  labs(title = "Number of Products by Category", x = "Product Category", y = "Count")

# Geolocation Dataset
# Distribution of latitude and longitude
ggplot(geolocation_dataset, aes(x = geolocation_lng, y = geolocation_lat)) +
  geom_point(alpha = 0.5, color = "red") +
  labs(title = "Geolocation of Customers", x = "Longitude", y = "Latitude")

# Order Items Dataset
# Distribution of prices
ggplot(order_items_dataset, aes(x = price)) +
  geom_histogram(bins = 30, fill = "blue", color = "black") +
  labs(title = "Distribution of Order Prices", x = "Price", y = "Frequency")

# Count of items per order
ggplot(order_items_dataset, aes(x = order_id)) +
  geom_bar(fill = "green", color = "black") +
  labs(title = "Number of Items per Order", x = "Order ID", y = "Count")

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Order Reviews Dataset
# Distribution of review scores
ggplot(order_reviews_dataset, aes(x = review_score)) +
  geom_bar(fill = "purple", color = "black") +
  labs(title = "Distribution of Review Scores", x = "Review Score", y = "Count")

# Average review score per order
order_reviews_avg <- order_reviews_dataset %>%
  group_by(order_id) %>%
  summarise(avg_review_score = mean(review_score, na.rm = TRUE))

ggplot(order_reviews_avg, aes(x = avg_review_score)) +
  geom_histogram(bins = 10, fill = "orange", color = "black") +
  labs(title = "Average Review Score per Order", x = "Average Review Score", y = "Frequency")

# Sellers Dataset
# Count of sellers by state
ggplot(sellers_dataset, aes(x = seller_state)) +
  geom_bar(fill = "blue", color = "black") +
  labs(title = "Number of Sellers by State", x = "State", y = "Count")

# Order Payments Dataset
# Distribution of payment values
ggplot(order_payments_dataset, aes(x = payment_value)) +
  geom_histogram(bins = 30, fill = "green", color = "black") +
  labs(title = "Distribution of Payment Values", x = "Payment Value", y = "Frequency")

# Payment types
ggplot(order_payments_dataset, aes(x = payment_type)) +
  geom_bar(fill = "purple", color = "black") +
  labs(title = "Count of Payment Types", x = "Payment Type", y = "Count")

# Orders Dataset
# Distribution of order statuses
ggplot(orders_dataset, aes(x = order_status)) +
  geom_bar(fill = "blue", color = "black") +
  labs(title = "Count of Order Statuses", x = "Order Status", y = "Count")

# Delivery times
orders_dataset$delivery_time <- as.numeric(difftime(orders_dataset$order_delivered_customer_date, orders_dataset$order_purchase_timestamp, units = "days"))
ggplot(orders_dataset, aes(x = delivery_time)) +
  geom_histogram(bins = 30, fill = "orange", color = "black") +
  labs(title = "Distribution of Delivery Times", x = "Delivery Time (days)", y = "Frequency")
## Warning: Removed 2965 rows containing non-finite outside the scale range
## (`stat_bin()`).

3. Advanced Analytics

library(dplyr)
# Join datasets on order_id
merged_dataset_order <- orders_dataset %>%
  left_join(order_items_dataset, by = "order_id")
merged_dataset_order
# Forecasting Sales

library(forecast)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
# Convert order_purchase_timestamp to Date type
merged_dataset_order$order_purchase_timestamp <- as.Date(merged_dataset_order$order_purchase_timestamp)

# Aggregate sales by month
monthly_sales <- merged_dataset_order %>%
  group_by(month = floor_date(order_purchase_timestamp, "month")) %>%
  summarise(total_sales = sum(price, na.rm = TRUE))


# Define the start date and frequency
start_date <- as.Date(paste0(year(min(monthly_sales$month)), "-", month(min(monthly_sales$month)), "-01"))
frequency <- 12  # Monthly data

# Create a time series object
sales_ts <- ts(monthly_sales$total_sales, start = c(year(start_date), month(start_date)), frequency = frequency)

# Apply Holt-Winters method
holt_winters_model <- HoltWinters(sales_ts)

# Forecast the next 12 months
forecast_values <- forecast(holt_winters_model, h = 12)

# Calculate end of the series
end_date <- seq.Date(from = start_date, by = "month", length.out = length(sales_ts))

# Generate future dates
forecast_dates <- seq.Date(from = end_date[length(end_date)] + months(1), by = "month", length.out = 12)
# Convert forecast to data frame for ggplot
forecast_df <- data.frame(
  Date = forecast_dates,
  Forecast = as.numeric(forecast_values$mean),
  Lower_80 = as.numeric(forecast_values$lower[,1]),
  Upper_80 = as.numeric(forecast_values$upper[,1]),
  Lower_95 = as.numeric(forecast_values$lower[,2]),
  Upper_95 = as.numeric(forecast_values$upper[,2])
)
forecast_df
# Plot the historical data and forecast
ggplot() +
  geom_line(data = monthly_sales, aes(x = month, y = total_sales), color = "blue") +
  geom_line(data = forecast_df, aes(x = Date, y = Forecast), color = "red") +
  geom_ribbon(data = forecast_df, aes(x = Date, ymin = Lower_95, ymax = Upper_95), alpha = 0.2) +
  labs(title = "Sales Forecast from 2017 using Holt-Winters Method", x = "Date", y = "Total Sales") +
  theme_minimal()

# Geoanalysis (Using geolocation_dataset):
# install.packages("rnaturalearthdata")
library(rnaturalearth)
library(rnaturalearthdata)
## 
## Attaching package: 'rnaturalearthdata'
## The following object is masked from 'package:rnaturalearth':
## 
##     countries110
library(sf)
## Linking to GEOS 3.12.1, GDAL 3.8.4, PROJ 9.3.1; sf_use_s2() is TRUE
library(viridis)
## Loading required package: viridisLite
geolocation_dataset$geolocation_lat <- as.numeric(geolocation_dataset$geolocation_lat)
geolocation_dataset$geolocation_lng <- as.numeric(geolocation_dataset$geolocation_lng)

# Convert to sf object
geolocation_sf <- st_as_sf(geolocation_dataset, 
                           coords = c("geolocation_lng", "geolocation_lat"), 
                           crs = 4326)

# Aggregate data 
state_data <- geolocation_sf %>%
  group_by(geolocation_state) %>%
  summarise(geometry = st_union(geometry), count = n())


# Get the world map
world <- ne_countries(scale = "medium", returnclass = "sf")

# Plot the map with geolocation data
ggplot(data = world) +
  geom_sf(fill = "lightgray") +
  geom_sf(data = state_data, aes(size = count, color = count), alpha = 0.7) +
  scale_color_viridis_c() +
  labs(title = "Geolocation of Customers by State",
       x = "Longitude",
       y = "Latitude",
       size = "Number of Customers",
       color = "Number of Customers") +
  theme_minimal()

# Outlier Detection (Using order_items_dataset):

# Boxplot to detect outliers
ggplot(order_items_dataset, aes(x = "", y = price)) +
  geom_boxplot() +
  labs(title = "Boxplot of Order Prices", y = "Price")

# Identifying outliers
order_items_dataset <- order_items_dataset %>%
  mutate(is_outlier = price > quantile(price, 0.95, na.rm = TRUE))

# Visualize outliers
ggplot(order_items_dataset, aes(x = price, color = as.factor(is_outlier))) +
  geom_histogram(bins = 30) +
  labs(title = "Outlier Detection in Order Prices")

# RFM Analysis (Using orders_dataset):
library(cluster)

merged_dataset_order_RFM <- orders_dataset %>%
  left_join(order_items_dataset, by = "order_id")
merged_dataset_order_RFM
# Compute RFM metrics
rfm_data <- merged_dataset_order_RFM %>%
  group_by(customer_id) %>%
  summarise(
    Recency = as.numeric(difftime(Sys.Date(), max(order_purchase_timestamp), units = "days")),
    Frequency = n(),
    Monetary = sum(price, na.rm = TRUE)
  )

# K-means clustering
set.seed(123)
rfm_clusters <- kmeans(rfm_data %>% select(Recency, Frequency, Monetary), centers = 4)
rfm_data$cluster <- rfm_clusters$cluster

# Visualize RFM clusters
ggplot(rfm_data, aes(x = Recency, y = Monetary, color = as.factor(cluster))) +
  geom_point() +
  labs(title = "RFM Clustering")

4. Generate a Model

library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(caret)
## Loading required package: lattice
library(ROSE)  # For balancing the dataset
## Loaded ROSE 0.0-4
merged_dataset_order_generate <- orders_dataset %>%
  left_join(order_items_dataset, by = "order_id")

# Convert columns to appropriate types
merged_dataset_order_generate <- merged_dataset_order_generate %>%
  mutate(
    order_purchase_timestamp = as.Date(order_purchase_timestamp),
    order_approved_at = as.Date(order_approved_at),
    price = as.numeric(price),
    freight_value = as.numeric(freight_value),
    order_status = as.factor(order_status) 
  )

# Remove rows with missing values or impute them
merged_dataset_order_generate <- na.omit(merged_dataset_order_generate)

# Balance the dataset
balanced_data <- ovun.sample(order_status ~ price + freight_value, 
                              data = merged_dataset_order_generate, 
                              method = "both",  # Both oversampling and undersampling
                              p = 0.5,  # Desired proportion
                              seed = 123)$data

# Split the balanced data
set.seed(123)
train_index <- createDataPartition(balanced_data$order_status, p = 0.7, list = FALSE)
train_data <- balanced_data[train_index, ]
test_data <- balanced_data[-train_index, ]

# Train Random Forest model
rf_model <- randomForest(order_status ~ price + freight_value, data = train_data)

# Make predictions
predictions <- predict(rf_model, test_data)

# Evaluate the model
confusionMatrix(predictions, test_data$order_status)
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  delivered canceled
##   delivered     16599        0
##   canceled         11    16443
##                                           
##                Accuracy : 0.9997          
##                  95% CI : (0.9994, 0.9998)
##     No Information Rate : 0.5025          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9993          
##                                           
##  Mcnemar's Test P-Value : 0.002569        
##                                           
##             Sensitivity : 0.9993          
##             Specificity : 1.0000          
##          Pos Pred Value : 1.0000          
##          Neg Pred Value : 0.9993          
##              Prevalence : 0.5025          
##          Detection Rate : 0.5022          
##    Detection Prevalence : 0.5022          
##       Balanced Accuracy : 0.9997          
##                                           
##        'Positive' Class : delivered       
## 

according to the above result, Sensitivity and Positive Predictive Value for the “delivered” class are very high, but the model struggles with predicting other classes